In this notebook a churn data set from a bank is prepared for model building. This data preparation includes deduplication, treatment of outliers, handling of missing values, and exploratory visualization.
# numerical / dataframe tooling
import numpy as np
import pandas as pd
# plotly for all visualizations in this notebook
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
py.init_notebook_mode()
import plotly.io as pio
# imported for later encoding/scaling steps (not used in this excerpt)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
pio.templates.default = "plotly_white"
pio.renderers.default = 'colab'
pd.options.plotting.backend = "plotly"
# mount Google Drive to access the raw csv file
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Load csv file into pandas dataframe
# load the raw (unprocessed) churn data from Google Drive
bank_data = pd.read_csv("/content/drive/MyDrive/DataPreparation/churn_data_before_processing.csv")
Show datatypes
# inspect dtypes to spot miscast columns (several numeric columns load as float)
bank_data.dtypes
Unnamed: 0 int64 RowNumber int64 CustomerId int64 Surname object CreditScore float64 Geography object Gender object Age float64 Tenure float64 Balance float64 NumOfProducts float64 HasCrCard float64 IsActiveMember float64 EstimatedSalary float64 Exited int64 dtype: object
Visualize head and tail:
# first five rows
bank_data.head()
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 15634602 | Hargrave | 619.0 | France | Female | 42.0 | 2.0 | 0.00 | 1.0 | 1.0 | 1.0 | 101348.88 | 1 |
| 1 | 1 | 2 | 15647311 | Hill | 608.0 | Spain | Female | 41.0 | 1.0 | 83807.86 | 1.0 | 0.0 | 1.0 | 112542.58 | 0 |
| 2 | 2 | 3 | 15619304 | Onio | 502.0 | France | Female | 42.0 | 8.0 | 159660.80 | 3.0 | 1.0 | 0.0 | 113931.57 | 1 |
| 3 | 3 | 4 | 15701354 | Boni | 699.0 | France | Female | 39.0 | 1.0 | 0.00 | 2.0 | 0.0 | NaN | 93826.63 | 0 |
| 4 | 4 | 5 | 15737888 | Mitchell | 850.0 | Spain | Female | 43.0 | 2.0 | 125510.82 | 1.0 | 1.0 | 1.0 | 79084.10 | 0 |
# last five rows
bank_data.tail()
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10145 | 3612 | 3613 | 15792151 | Hamilton | 635.0 | Spain | Female | 37.0 | 3.0 | 0.00 | 2.0 | 1.0 | 0.0 | 91086.73 | 0 |
| 10146 | 4437 | 4438 | 15779522 | Efimov | 736.0 | France | Female | 24.0 | 0.0 | 0.00 | 2.0 | 1.0 | 0.0 | 109355.73 | 1 |
| 10147 | 6549 | 6550 | 15792029 | Lee | 620.0 | France | Male | 32.0 | 6.0 | 0.00 | 2.0 | 1.0 | 0.0 | 56139.09 | 0 |
| 10148 | 4590 | 4591 | 15680167 | Thomson | 635.0 | France | Female | 78.0 | 6.0 | 47536.40 | 1.0 | NaN | 1.0 | 119400.08 | 0 |
| 10149 | 8 | 9 | 15792365 | He | 501.0 | France | Male | 44.0 | 4.0 | 142051.07 | 2.0 | 0.0 | 1.0 | 74940.50 | 0 |
Cast categorical variables ('HasCrCard', 'IsActiveMember', 'Exited') into actual booleans, drop general duplicates
#cast categorical values into booleans
# only non-null entries are cast, so NaN is preserved and the columns become
# dtype "object" holding True/False/NaN (see the tables below) rather than bool
bank_data['HasCrCard'] = bank_data.loc[bank_data['HasCrCard'].notnull(), 'HasCrCard'].astype(bool)
bank_data['IsActiveMember'] = bank_data.loc[bank_data['IsActiveMember'].notnull(), 'IsActiveMember'].astype(bool)
bank_data['Exited'] = bank_data.loc[bank_data['Exited'].notnull(), 'Exited'].astype(bool)
#identify duplicates
# number of fully identical rows (all columns equal)
bank_data.duplicated().sum()
104
#drop duplicates
# keep the first occurrence of each fully identical row
bank_data.drop_duplicates(keep="first", inplace = True)
Identify duplicates in the column "CustomerId"
# rows sharing a CustomerId but differing in at least one other column
bank_data.duplicated(["CustomerId"]).sum()
46
Add column with sum of missing values per row
# count missing values per row; used below to decide which duplicate to keep
bank_data['nan_sum'] = bank_data.isnull().sum(axis=1)
bank_data
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | nan_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 15634602 | Hargrave | 619.0 | France | Female | 42.0 | 2.0 | 0.00 | 1.0 | True | True | 101348.88 | True | 0 |
| 1 | 1 | 2 | 15647311 | Hill | 608.0 | Spain | Female | 41.0 | 1.0 | 83807.86 | 1.0 | False | True | 112542.58 | False | 0 |
| 2 | 2 | 3 | 15619304 | Onio | 502.0 | France | Female | 42.0 | 8.0 | 159660.80 | 3.0 | True | False | 113931.57 | True | 0 |
| 3 | 3 | 4 | 15701354 | Boni | 699.0 | France | Female | 39.0 | 1.0 | 0.00 | 2.0 | False | NaN | 93826.63 | False | 1 |
| 4 | 4 | 5 | 15737888 | Mitchell | 850.0 | Spain | Female | 43.0 | 2.0 | 125510.82 | 1.0 | True | True | 79084.10 | False | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10131 | 7511 | 7512 | 15686913 | Kung | 757.0 | France | Male | NaN | 0.0 | 0.00 | 1.0 | True | False | 83263.06 | False | 1 |
| 10133 | 8610 | 8611 | 15731553 | Lucas | 730.0 | France | Male | 23.0 | 8.0 | 0.00 | 2.0 | True | False | 183284.53 | False | 0 |
| 10136 | 2997 | 2998 | 15756820 | Fleming | 655.0 | France | Female | 26.0 | NaN | 106198.50 | 1.0 | False | True | 32020.42 | False | 1 |
| 10138 | 3856 | 3857 | 15653306 | Ermakova | 679.0 | Germany | Female | 32.0 | 0.0 | 88335.05 | NaN | False | False | 159584.81 | False | 1 |
| 10148 | 4590 | 4591 | 15680167 | Thomson | 635.0 | France | Female | 78.0 | 6.0 | 47536.40 | 1.0 | NaN | True | 119400.08 | False | 1 |
10046 rows × 16 columns
Check if there are any duplicate customer IDs with the same number of missing values
# show every row whose (CustomerId, nan_sum) pair occurs more than once;
# duplicated() already returns a boolean mask, so no "== True" comparison is needed
bank_data[bank_data.duplicated(["CustomerId", "nan_sum"], keep=False)]
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | nan_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2105 | 2105 | 2106 | 15615096 | Costa | 492.0 | France | Female | NaN | 7.000000 | 0.00 | 2.0 | True | True | 49463.44 | False | 1 |
| 2705 | 2705 | 2706 | 15585835 | Lord | 655.0 | Spain | Female | NaN | 4.000000 | 109783.69 | 2.0 | True | False | 134034.32 | False | 1 |
| 5039 | 5039 | 5040 | 15775490 | Downie | 660.0 | France | Female | 38.0 | 5.000000 | 110570.78 | 2.0 | True | False | 195906.59 | False | 0 |
| 7679 | 7679 | 7680 | 15790689 | Hibbins | 647.0 | Spain | Male | NaN | 9.000000 | 80958.36 | 1.0 | True | True | 128590.73 | False | 1 |
| 10003 | 2105 | 2106 | 15615096 | Costa | 492.0 | France | Female | 31.0 | 7.000000 | NaN | 2.0 | True | True | 49463.44 | False | 1 |
| 10062 | 2705 | 2706 | 15585835 | Lord | 655.0 | Spain | NaN | 34.0 | 4.000000 | 109783.69 | 2.0 | True | False | 134034.32 | False | 1 |
| 10100 | 5039 | 5040 | 15775490 | Downie | 660.0 | France | Female | 38.0 | 40.066513 | 110570.78 | 2.0 | True | False | 195906.59 | False | 0 |
| 10120 | 7679 | 7680 | 15790689 | Hibbins | 647.0 | Spain | Male | 32.0 | 9.000000 | 80958.36 | 1.0 | True | True | NaN | False | 1 |
Three 'CustomerId' duplicates contain one missing value, both entries for '15775490' have zero missing values. This entry was not removed as a general duplicate earlier because both entries have different values in the 'Tenure' column. For those four customer IDs with the same number of missing values we drop the first entry. This way we will drop the second entry for '15775490', which contains an odd 'Tenure' value: '40.066513' (higher than age, probably an error)
# for CustomerId duplicates with an equal number of missing values keep the first
# occurrence (this drops the odd 'Tenure' = 40.07 entry for customer 15775490)
bank_data.drop_duplicates(["CustomerId", "nan_sum"], keep='first', inplace=True)
bank_data
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | nan_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 15634602 | Hargrave | 619.0 | France | Female | 42.0 | 2.0 | 0.00 | 1.0 | True | True | 101348.88 | True | 0 |
| 1 | 1 | 2 | 15647311 | Hill | 608.0 | Spain | Female | 41.0 | 1.0 | 83807.86 | 1.0 | False | True | 112542.58 | False | 0 |
| 2 | 2 | 3 | 15619304 | Onio | 502.0 | France | Female | 42.0 | 8.0 | 159660.80 | 3.0 | True | False | 113931.57 | True | 0 |
| 3 | 3 | 4 | 15701354 | Boni | 699.0 | France | Female | 39.0 | 1.0 | 0.00 | 2.0 | False | NaN | 93826.63 | False | 1 |
| 4 | 4 | 5 | 15737888 | Mitchell | 850.0 | Spain | Female | 43.0 | 2.0 | 125510.82 | 1.0 | True | True | 79084.10 | False | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10131 | 7511 | 7512 | 15686913 | Kung | 757.0 | France | Male | NaN | 0.0 | 0.00 | 1.0 | True | False | 83263.06 | False | 1 |
| 10133 | 8610 | 8611 | 15731553 | Lucas | 730.0 | France | Male | 23.0 | 8.0 | 0.00 | 2.0 | True | False | 183284.53 | False | 0 |
| 10136 | 2997 | 2998 | 15756820 | Fleming | 655.0 | France | Female | 26.0 | NaN | 106198.50 | 1.0 | False | True | 32020.42 | False | 1 |
| 10138 | 3856 | 3857 | 15653306 | Ermakova | 679.0 | Germany | Female | 32.0 | 0.0 | 88335.05 | NaN | False | False | 159584.81 | False | 1 |
| 10148 | 4590 | 4591 | 15680167 | Thomson | 635.0 | France | Female | 78.0 | 6.0 | 47536.40 | 1.0 | NaN | True | 119400.08 | False | 1 |
10042 rows × 16 columns
All other 'CustomerId' duplicates differ in the number of missing values, so we can drop the one with more missing values. For this purpose the dataframe is sorted by 'CustomerId' and by the number of missing values
# sort so that, per CustomerId, the entry with the fewest missing values comes first
bank_data.sort_values(["CustomerId","nan_sum"], ascending=[True,True], inplace=True)
# duplicated() already returns a boolean mask, so no "== True" comparison is needed
bank_data[bank_data.duplicated("CustomerId", keep=False)]
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | nan_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1553 | 1553 | 1554 | 15568106 | NaN | 592.0 | France | Female | 38.0 | 8.0 | 119278.01 | 2.0 | False | True | 19370.73 | False | 1 |
| 10068 | 1553 | 1554 | 15568106 | NaN | 592.0 | France | Female | NaN | 8.0 | 119278.01 | 2.0 | False | True | 19370.73 | False | 2 |
| 658 | 658 | 659 | 15568595 | Fleming | 544.0 | France | Male | 64.0 | 9.0 | 113829.45 | 1.0 | True | True | 124341.49 | False | 0 |
| 10104 | 658 | 659 | 15568595 | Fleming | 544.0 | France | Male | NaN | 9.0 | 113829.45 | 1.0 | True | True | 124341.49 | False | 1 |
| 10049 | 2760 | 2761 | 15591123 | Iredale | 557.0 | Germany | Male | 68.0 | 2.0 | 100194.44 | 1.0 | True | True | 38596.34 | False | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10066 | 1261 | 1262 | 15794870 | Sal | 744.0 | NaN | Male | 38.0 | 6.0 | 73023.17 | 2.0 | True | False | 78770.86 | False | 1 |
| 10106 | 4918 | 4919 | 15796074 | Bruno | 717.0 | France | Female | 36.0 | 2.0 | 99472.76 | 2.0 | True | False | 94274.72 | True | 0 |
| 4918 | 4918 | 4919 | 15796074 | Bruno | 717.0 | France | Female | NaN | 2.0 | 99472.76 | 2.0 | True | False | 94274.72 | True | 1 |
| 10002 | 1431 | 1432 | 15809772 | Glover | 667.0 | France | Male | 48.0 | 2.0 | 0.00 | 1.0 | True | False | 43229.20 | False | 0 |
| 1431 | 1431 | 1432 | 15809772 | Glover | 667.0 | France | Male | 48.0 | 2.0 | 0.00 | 1.0 | NaN | False | 43229.20 | False | 1 |
84 rows × 16 columns
Delete the last entry for each duplicate 'CustomerId', so we keep the entry that has less missing values
# rows are sorted by nan_sum per CustomerId, so keeping the first occurrence
# keeps the entry with the fewest missing values
bank_data.drop_duplicates("CustomerId", keep='first', inplace=True)
bank_data
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | nan_sum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1287 | 1287 | 1288 | 15565701 | Ferri | 698.0 | Spain | NaN | 39.0 | 9.0 | 161993.89 | 1.0 | False | False | 90212.38 | False | 1 |
| 4198 | 4198 | 4199 | 15565706 | Akobundu | 612.0 | Spain | Male | 35.0 | 1.0 | 0.00 | 1.0 | True | True | 83256.26 | True | 0 |
| 7090 | 7090 | 7091 | 15565714 | Cattaneo | 601.0 | France | Male | 47.0 | 1.0 | 64430.06 | 2.0 | False | True | 96517.97 | False | 0 |
| 2020 | 2020 | 2021 | 15565779 | Kent | 627.0 | Germany | Female | 30.0 | 6.0 | 57809.32 | 1.0 | True | False | 188258.49 | False | 0 |
| 3697 | 3697 | 3698 | 15565796 | Docherty | 745.0 | Germany | Male | 48.0 | 10.0 | 96048.55 | 1.0 | True | False | 74510.65 | False | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3411 | 3411 | 3412 | 15815628 | Moysey | 711.0 | France | Female | 37.0 | 8.0 | 113899.92 | 1.0 | False | False | 80215.20 | False | 0 |
| 8271 | 8271 | 8272 | 15815645 | Akhtar | 481.0 | France | NaN | 37.0 | 8.0 | 152303.66 | 2.0 | True | True | 175082.20 | False | 1 |
| 8088 | 8088 | 8089 | 15815656 | Hopkins | 541.0 | Germany | Female | 39.0 | 9.0 | 100116.67 | 1.0 | True | True | 199808.10 | True | 0 |
| 1762 | 1762 | 1763 | 15815660 | Mazzi | 758.0 | France | Female | 34.0 | 1.0 | 154139.45 | 1.0 | True | True | 60728.89 | False | 0 |
| 5502 | 5502 | 5503 | 15815690 | Akabueze | 614.0 | Spain | Female | 40.0 | 3.0 | 113348.50 | 1.0 | True | True | 77789.01 | False | 0 |
10000 rows × 16 columns
Run describe method to show mean, median and quantiles for continuous variables in table ('CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary')
# summary statistics for the continuous variables (reveals negatives and stray floats)
bank_data[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary']].describe()
| CreditScore | Age | Tenure | Balance | NumOfProducts | EstimatedSalary | |
|---|---|---|---|---|---|---|
| count | 9982.000000 | 9743.000000 | 9923.000000 | 9.850000e+03 | 9715.000000 | 9957.000000 |
| mean | 650.902076 | 38.947244 | 5.014617 | 7.672267e+04 | 1.528171 | 100117.009341 |
| std | 100.409554 | 10.560561 | 2.905646 | 6.372769e+04 | 0.593393 | 57496.773622 |
| min | 350.000000 | -49.000000 | -5.000000 | 0.000000e+00 | -2.000000 | 11.580000 |
| 25% | 584.000000 | 32.000000 | 3.000000 | 0.000000e+00 | 1.000000 | 51099.560000 |
| 50% | 652.000000 | 37.000000 | 5.000000 | 9.724599e+04 | 1.000000 | 100240.200000 |
| 75% | 718.000000 | 44.000000 | 7.000000 | 1.276414e+05 | 2.000000 | 149399.700000 |
| max | 2924.008649 | 92.000000 | 32.044103 | 1.036826e+06 | 11.177460 | 199992.480000 |
As can be seen with the describe() method the 'Age', 'Tenure' and 'NumOfProducts' columns contain negative values. Aside from that some float values can be found in the 'Tenure' and 'NumOfProducts' columns, those need to be removed.
Negative values are removed by using the ".abs()" method:
# NOTE(review): abs() assumes the negative signs are data-entry errors and the
# magnitudes themselves are correct — worth confirming with the data owner
bank_data['Age'] = bank_data['Age'].abs()
bank_data['Tenure'] = bank_data['Tenure'].abs()
bank_data['NumOfProducts'] = bank_data['NumOfProducts'].abs()
'Tenure' and 'NumOfProducts' columns contain float values as shown by the describe() method, these need to be turned into integers.
#check unique values in "Tenure" column
bank_data['Tenure'].unique()
array([ 9. , 1. , 6. , 10. , 3. ,
8. , nan, 5. , 4. , 0. ,
2. , 7. , 32.04410309])
#convert float values in the 'Tenure' column into integers to avoid having decimal numbers
# astype(int) truncates decimals (32.044... -> 32); because NaN survives the
# filtered assignment, the column remains float dtype (values shown as 32.0 etc.)
bank_data['Tenure'] = bank_data['Tenure'][bank_data['Tenure'].notnull()].astype(int)
bank_data['Tenure'].unique()
array([ 9., 1., 6., 10., 3., 8., nan, 5., 4., 0., 2., 7., 32.])
#check unique values for 'NumOfProducts'
bank_data['NumOfProducts'].unique()
array([ 1. , 2. , nan, 3. , 4. ,
11.17746026])
#convert 'NumOfProducts' values into integers to avoid having decimal numbers
# same truncation-with-NaN pattern as for 'Tenure' above: 11.177... -> 11,
# the column stays float dtype because NaN entries are kept
bank_data['NumOfProducts'] = bank_data['NumOfProducts'][bank_data['NumOfProducts'].notnull()].astype(int)
bank_data['NumOfProducts'].unique()
array([ 1., 2., nan, 3., 4., 11.])
The describe() method is run again to ensure that the negative values and float values were properly removed.
# re-check: negatives and stray floats should now be gone
bank_data.describe()
| Unnamed: 0 | RowNumber | CustomerId | CreditScore | Age | Tenure | Balance | NumOfProducts | EstimatedSalary | nan_sum | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 10000.00000 | 10000.00000 | 1.000000e+04 | 9982.000000 | 9743.000000 | 9923.000000 | 9.850000e+03 | 9715.000000 | 9957.000000 | 10000.000000 |
| mean | 4999.50000 | 5000.50000 | 1.569094e+07 | 650.902076 | 38.962024 | 5.015620 | 7.672267e+04 | 1.529799 | 100117.009341 | 0.154900 |
| std | 2886.89568 | 2886.89568 | 7.193619e+04 | 100.409554 | 10.505895 | 2.903865 | 6.372769e+04 | 0.588837 | 57496.773622 | 0.391563 |
| min | 0.00000 | 1.00000 | 1.556570e+07 | 350.000000 | 18.000000 | 0.000000 | 0.000000e+00 | 1.000000 | 11.580000 | 0.000000 |
| 25% | 2499.75000 | 2500.75000 | 1.562853e+07 | 584.000000 | 32.000000 | 3.000000 | 0.000000e+00 | 1.000000 | 51099.560000 | 0.000000 |
| 50% | 4999.50000 | 5000.50000 | 1.569074e+07 | 652.000000 | 37.000000 | 5.000000 | 9.724599e+04 | 1.000000 | 100240.200000 | 0.000000 |
| 75% | 7499.25000 | 7500.25000 | 1.575323e+07 | 718.000000 | 44.000000 | 7.000000 | 1.276414e+05 | 2.000000 | 149399.700000 | 0.000000 |
| max | 9999.00000 | 10000.00000 | 1.581569e+07 | 2924.008649 | 92.000000 | 32.000000 | 1.036826e+06 | 11.000000 | 199992.480000 | 3.000000 |
The result is saved to a new dataframe.
# work on a copy so the deduplicated frame stays untouched
bank_data2=bank_data.copy()
# columns used for the univariate plots below
cols_for_plots = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']
def plot_hist(colname, df=bank_data2, sorted=True):
    """Show a bar chart of the value counts of one column of *df*.

    NOTE(review): the ``sorted`` parameter shadows the builtin of the same
    name; kept unchanged for backward compatibility.
    """
    counts = df[colname].value_counts()
    if sorted:
        counts = counts.sort_values(ascending=False)
    fig = go.Figure(
        data=[go.Bar(x=counts.index, y=counts.values)],
        layout=go.Layout(title=colname),
    )
    fig.show()
def plot_box(colname, df=bank_data2):
    """Show a horizontal box plot of one continuous column of *df*."""
    fig = go.Figure(
        data=[go.Box(x=df[colname], name=None)],
        layout=go.Layout(title=colname),
    )
    fig.show()
Draw univariate plots
# histogram for categorical/boolean columns, box plot for continuous ones
for col in cols_for_plots:
    if bank_data2[col].dtype in ["object", "bool"]:
        plot_hist(col)
    else:
        plot_box(col)
Define bivariate plot functions to see if and how any columns are correlated, show only the most interesting ones
#function to display continuous vs. categorical variables
def plot_cont_vs_cat(x, y, df=bank_data2, cutoff_count_x=10):
    """Draw one box of the continuous column *y* per category of *x*.

    Categories of *x* with at most ``cutoff_count_x`` rows are skipped so the
    plot stays readable.
    """
    traces = []
    for xval in np.sort(df[x].dropna().unique()):
        if (df[x] == xval).sum() > cutoff_count_x:
            traces.append(
                go.Box(
                    y=df.loc[df[x] == xval, y],
                    name=str(xval),
                    showlegend=False,
                )
            )
    fig = go.Figure(
        data=traces,
        layout=go.Layout(title="{} vs {}".format(y, x)),
    )
    fig.show()
#function to display categorical vs. categorical variables
def plot_cat_vs_cat(x, y,df=bank_data2, norm=True, cutoff_count_x=10):
    """Draw a stacked bar chart of *y* categories within each *x* category.

    Rows are counted via the 'CustomerId' column; x-categories with at most
    ``cutoff_count_x`` rows are dropped. With ``norm=True`` every x-category
    is scaled to 100 %.
    """
    counts = (
        df[["CustomerId", x, y]]
        .dropna(how="any", axis=0)
        .pivot_table(index=y, columns=x, values="CustomerId", aggfunc="count")
    )
    # keep only sufficiently frequent x-categories
    counts = counts.loc[:, counts.sum(axis=0) > cutoff_count_x]
    if norm:
        counts = counts.div(counts.sum(axis=0), axis=1) * 100
    traces = [
        go.Bar(x=counts.columns, y=counts.loc[cat], name=str(cat), showlegend=True)
        for cat in np.sort(counts.index)
    ]
    fig = go.Figure(
        data=traces,
        layout=go.Layout(title="{} vs {}".format(y, x), barmode="stack"),
    )
    fig.show()
Draw differences in age distribution by geography:
plot_cont_vs_cat('Geography', 'Age', df=bank_data2)
In Germany the median age is slightly higher than in France and Spain.
Draw balance by region:
plot_cont_vs_cat('Geography', 'Balance', df=bank_data2)
There is a difference in balance by geography, for Germany the distribution is closer to the median than for France and Spain.
Show insights on clients who have exited:
plot_cat_vs_cat('Geography', 'Exited',df=bank_data2, norm=True, cutoff_count_x=10)
The share of clients that have exited is higher for German clients than for clients in France and Spain.
plot_cat_vs_cat('Gender', 'Exited',df=bank_data2, norm=True, cutoff_count_x=10)
The share of clients that have exited is higher for women than for men.
plot_cat_vs_cat('IsActiveMember', 'Exited',df=bank_data2, norm=True, cutoff_count_x=10)
Clients who have not exited are more likely to be an active member than those who have.
As shown in the univariate boxplots some of the continuous variables seem to have outliers ('CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts'). The 'EstimatedSalary' column does not seem to have outliers.
# variables screened for univariate (IQR-based) outliers below
outlier_testing_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
def univariate_outliers(s):
    """Score every value of Series *s* with an IQR-based outlier code.

    Returns an integer Series aligned with *s*:
      0   inlier
      1   moderate high outlier (> q3 + 1.5*IQR)
      2   extreme  high outlier (> q3 + 3*IQR)
     -1   moderate low  outlier (< q1 - 1.5*IQR)
     -2   extreme  low  outlier (< q1 - 3*IQR)
    """
    lower_q = s.quantile(q=0.25)
    upper_q = s.quantile(q=0.75)
    spread = upper_q - lower_q
    # each fence crossed adds (or subtracts) one point
    high = (s > (upper_q + 1.5 * spread)).astype(int) + (s > (upper_q + 3 * spread)).astype(int)
    low = (s < (lower_q - 1.5 * spread)).astype(int) + (s < (lower_q - 3 * spread)).astype(int)
    return high - low
Create dataframe that shows the number of outliers per column.
# one outlier-code column per tested variable (codes: -2/-1 low, 1/2 high, 0 inlier)
bank_data_outl_df = pd.DataFrame({"outl_" + col: univariate_outliers(bank_data2[col]) for col in outlier_testing_cols})
# rows where 'CreditScore' is flagged, grouped by severity code below
outl_CreditScore_df = pd.DataFrame(data=bank_data_outl_df.loc[bank_data_outl_df["outl_CreditScore"]!=0])
outl_CreditScore_df.groupby(["outl_CreditScore"]).count()
| outl_Age | outl_Tenure | outl_Balance | outl_NumOfProducts | outl_EstimatedSalary | |
|---|---|---|---|---|---|
| outl_CreditScore | |||||
| -1 | 15 | 15 | 15 | 15 | 15 |
| 2 | 2 | 2 | 2 | 2 | 2 |
There are two extreme high outliers and 15 moderate low outliers for 'CreditScore'.
# count 'Age' outliers per severity code
outl_Age_df = pd.DataFrame(data=bank_data_outl_df.loc[bank_data_outl_df["outl_Age"]!=0])
outl_Age_df.groupby(["outl_Age"]).count()
| outl_CreditScore | outl_Tenure | outl_Balance | outl_NumOfProducts | outl_EstimatedSalary | |
|---|---|---|---|---|---|
| outl_Age | |||||
| 1 | 340 | 340 | 340 | 340 | 340 |
| 2 | 12 | 12 | 12 | 12 | 12 |
12 extreme high outliers and 340 moderate high outliers for 'Age'.
# count 'Tenure' outliers per severity code
outl_Tenure_df = pd.DataFrame(data=bank_data_outl_df.loc[bank_data_outl_df["outl_Tenure"]!=0])
outl_Tenure_df.groupby(["outl_Tenure"]).count()
| outl_CreditScore | outl_Age | outl_Balance | outl_NumOfProducts | outl_EstimatedSalary | |
|---|---|---|---|---|---|
| outl_Tenure | |||||
| 2 | 1 | 1 | 1 | 1 | 1 |
There is one extreme high outlier in the 'Tenure' column.
# count 'Balance' outliers per severity code
outl_Balance_df = pd.DataFrame(data=bank_data_outl_df.loc[bank_data_outl_df["outl_Balance"]!=0])
outl_Balance_df.groupby(["outl_Balance"]).count()
| outl_CreditScore | outl_Age | outl_Tenure | outl_NumOfProducts | outl_EstimatedSalary | |
|---|---|---|---|---|---|
| outl_Balance | |||||
| 2 | 2 | 2 | 2 | 2 | 2 |
There are two extreme high outliers in the 'Balance' column.
# count 'EstimatedSalary' outliers per severity code (table below is empty)
outl_EstimatedSalary_df = pd.DataFrame(data=bank_data_outl_df.loc[bank_data_outl_df["outl_EstimatedSalary"]!=0])
outl_EstimatedSalary_df.groupby(["outl_EstimatedSalary"]).count()
| outl_CreditScore | outl_Age | outl_Tenure | outl_Balance | outl_NumOfProducts | |
|---|---|---|---|---|---|
| outl_EstimatedSalary |
There are no outliers for 'EstimatedSalary'.
The outlier count is added to the general dataset.
# append the outlier-code columns alongside the original data (same index)
bank_data2 = pd.concat([bank_data2, bank_data_outl_df],axis=1)
Create a column with the sum of outliers to later check the number of outliers per row.
# total outlier score per row (a single extreme outlier contributes 2)
bank_data2["sum_outl"] = bank_data2[["outl_CreditScore","outl_Age","outl_Balance",
"outl_EstimatedSalary","outl_NumOfProducts","outl_Tenure"]].sum(axis=1)
Filter dataframe by rows that contain extreme outliers:
# rows where any of the six outlier codes equals 2 (i.e. an extreme high outlier)
extreme_outl = bank_data2.loc[
    bank_data2[["outl_CreditScore", "outl_Age", "outl_Tenure",
                "outl_Balance", "outl_NumOfProducts", "outl_EstimatedSalary"]]
    .eq(2)
    .any(axis=1)
].copy()
extreme_outl
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | ... | EstimatedSalary | Exited | nan_sum | outl_CreditScore | outl_Age | outl_Tenure | outl_Balance | outl_NumOfProducts | outl_EstimatedSalary | sum_outl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3033 | 3033 | 3034 | 15578006 | Yao | 787.000000 | France | Female | 85.0 | 10.0 | 0.000000e+00 | ... | 116537.96 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 9080 | 9080 | 9081 | 15620443 | Fiorentino | 711.000000 | France | Female | 81.0 | 6.0 | 0.000000e+00 | ... | 72276.24 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 9309 | 9309 | 9310 | 15621644 | Lombardi | 678.000000 | Germany | Male | 83.0 | 6.0 | 1.233566e+05 | ... | 92934.41 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 3531 | 3531 | 3532 | 15653251 | Hickey | 408.000000 | France | Female | 84.0 | 8.0 | 8.787339e+04 | ... | 188484.52 | True | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 6759 | 6759 | 6760 | 15660878 | T'ien | 705.000000 | France | Male | 92.0 | 1.0 | 1.260762e+05 | ... | 34436.83 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 6909 | 6909 | 6910 | 15667002 | Knight | 666.000000 | Spain | Male | 43.0 | 5.0 | 0.000000e+00 | ... | 29346.10 | False | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 |
| 5204 | 5204 | 5205 | 15704231 | Barrett | 430.000000 | France | Female | 33.0 | 32.0 | 0.000000e+00 | ... | 69759.91 | False | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 2 |
| 2135 | 2135 | 2136 | 15704284 | Ekechukwu | 736.000000 | Germany | Male | 57.0 | 9.0 | 9.697802e+05 | ... | 28434.44 | True | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 2 |
| 8493 | 8493 | 8494 | 15727619 | Lock | 2924.008649 | Germany | Female | 46.0 | 9.0 | 1.139097e+05 | ... | 92320.37 | True | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 |
| 7956 | 7956 | 7957 | 15731569 | Hudson | 850.000000 | France | Male | 81.0 | 5.0 | 0.000000e+00 | ... | 44827.47 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 6443 | 6443 | 6444 | 15764927 | Rogova | 753.000000 | France | Male | 92.0 | 3.0 | 1.215133e+05 | ... | 195563.99 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 4931 | 4931 | 4932 | 15772341 | NaN | 682.000000 | Germany | Male | 81.0 | 6.0 | 1.220291e+05 | ... | 50783.88 | False | 1 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 2258 | 2258 | 2259 | 15776631 | Ma | 2158.510025 | France | Female | 36.0 | 5.0 | 1.195401e+05 | ... | 80603.99 | False | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 |
| 5464 | 5464 | 5465 | 15795540 | Reye | 556.000000 | France | Female | 36.0 | 2.0 | 1.036826e+06 | ... | 177670.57 | False | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 2 |
| 3387 | 3387 | 3388 | 15798024 | Lori | 537.000000 | Germany | Male | 84.0 | 8.0 | 9.224234e+04 | ... | 186235.98 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 7526 | 7526 | 7527 | 15800554 | Perry | 850.000000 | France | Female | 81.0 | 1.0 | 0.000000e+00 | ... | 59568.24 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 766 | 766 | 767 | 15810864 | Williamson | 700.000000 | France | Female | 82.0 | 2.0 | 0.000000e+00 | ... | 182055.36 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
| 2458 | 2458 | 2459 | 15813303 | Rearick | 513.000000 | Spain | Male | 88.0 | 10.0 | 0.000000e+00 | ... | 52952.24 | False | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 |
18 rows × 23 columns
Deal with extreme outliers in each variable
# if the only value is 2, no row holds more than one extreme outlier
extreme_outl["sum_outl"].unique()
array([2])
As the unique() method shows, there is never more than one extreme outlier per row. That suggests that there is no correlation between the extreme outliers.
Extreme outliers 'Age': leave as they are. Since there are also a couple of moderate outliers in the age column the extreme outliers are most likely not an error.
Extreme outliers 'CreditScore': replace with mean. The 'CreditScore' column contains no moderate high outliers, only two extreme ones, which suggests that the two extreme ones are an error.
# replace the two extreme 'CreditScore' outliers with the column mean
# NOTE(review): the mean is computed before replacement, so it still includes
# the two outliers — negligible over ~10k rows, but worth confirming
bank_data2.loc[bank_data2.outl_CreditScore==2,'CreditScore'] = bank_data2["CreditScore"].mean()
Extreme outlier 'Tenure': replace by median. The 'Tenure' column contains no moderate high outliers, only one extreme one, which suggests that the extreme one is an error.
# replace the single extreme 'Tenure' outlier with the column median
bank_data2.loc[bank_data2.outl_Tenure==2,'Tenure'] = bank_data2["Tenure"].median()
Extreme outliers 'Balance': replace with mean by 'Geography'. The 'Balance' column contains no moderate high outliers, only two extreme ones, so the two extreme ones are probably an error.
# mean balance per country, used as the replacement value
balance_replacer = bank_data2.groupby('Geography')['Balance'].mean()
# Series.map looks each row's country up in balance_replacer; unlike the previous
# per-element .loc lookup it yields NaN (instead of raising) should a country
# ever be missing from the lookup table
bank_data2.loc[bank_data2.outl_Balance==2, 'Balance'] = (
    bank_data2.loc[bank_data2.outl_Balance==2, 'Geography'].map(balance_replacer)
)
Extreme outliers 'NumOfProducts': replace with median. The 'NumOfProducts' column contains no moderate high outliers, only two extreme ones, so the two extreme ones are probably an error (similar situation as with 'CreditScore', 'Tenure' and 'Balance').
# replace the two extreme 'NumOfProducts' outliers with the column median
bank_data2.loc[bank_data2.outl_NumOfProducts==2,'NumOfProducts'] = bank_data2["NumOfProducts"].median()
Check if all extreme outliers in 'CreditScore', 'Tenure', 'Balance' and 'NumOfProducts' were replaced properly:
# verify the replacements: the flagged rows should now hold imputed values
bank_data2.loc[
(bank_data2["outl_CreditScore"]==2)|
(bank_data2["outl_Tenure"]==2)|
(bank_data2["outl_Balance"]==2)|
(bank_data2["outl_NumOfProducts"]==2)]
| Unnamed: 0 | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | ... | EstimatedSalary | Exited | nan_sum | outl_CreditScore | outl_Age | outl_Tenure | outl_Balance | outl_NumOfProducts | outl_EstimatedSalary | sum_outl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6909 | 6909 | 6910 | 15667002 | Knight | 666.000000 | Spain | Male | 43.0 | 5.0 | 0.000000 | ... | 29346.10 | False | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 |
| 5204 | 5204 | 5205 | 15704231 | Barrett | 430.000000 | France | Female | 33.0 | 5.0 | 0.000000 | ... | 69759.91 | False | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 2 |
| 2135 | 2135 | 2136 | 15704284 | Ekechukwu | 736.000000 | Germany | Male | 57.0 | 9.0 | 119950.144163 | ... | 28434.44 | True | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 2 |
| 8493 | 8493 | 8494 | 15727619 | Lock | 650.902076 | Germany | Female | 46.0 | 9.0 | 113909.690000 | ... | 92320.37 | True | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 |
| 2258 | 2258 | 2259 | 15776631 | Ma | 650.902076 | France | Female | 36.0 | 5.0 | 119540.150000 | ... | 80603.99 | False | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 |
| 5464 | 5464 | 5465 | 15795540 | Reye | 556.000000 | France | Female | 36.0 | 2.0 | 62629.021915 | ... | 177670.57 | False | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 2 |
6 rows × 23 columns
Save result to new dataframe, without outliers.
# snapshot of the data after outlier treatment
bank_data3 = bank_data2.copy()
Now that the outliers were removed some more visualizations will be created to see the churn rate depending on the continuous and categorical variables.
# i. Geography
# churn rate (%) per country; Exited is boolean, so the mean is the churn share
df_churnr_geo = bank_data3.groupby('Geography').Exited.mean().round(2).reset_index()
df_churnr_geo['Exited']=df_churnr_geo['Exited']*100
# drop Spain by label instead of by hard-coded positional index [2],
# which silently breaks if the row order ever changes
df_churnr_geo = df_churnr_geo[df_churnr_geo['Geography'] != 'Spain']
df_churnr_geo
| Geography | Exited | |
|---|---|---|
| 0 | France | 16.0 |
| 1 | Germany | 32.0 |
# bar chart: churn rate by country
fig = px.bar(
df_churnr_geo,
x='Geography',
y='Exited',
text = 'Exited',
title='Churn Rate by Country',
labels ={'Geography': "Country", 'Exited': "Churn rate (%)"}
)
fig.show()
# ii. Gender
# churn rate (%) per gender
df_churnr_gen = bank_data3.groupby('Gender').Exited.mean().round(2).reset_index()
df_churnr_gen['Exited'] = df_churnr_gen['Exited']*100
df_churnr_gen
| Gender | Exited | |
|---|---|---|
| 0 | Female | 25.0 |
| 1 | Male | 16.0 |
# bar chart: churn rate by gender
fig2 = px.bar(
df_churnr_gen,
x='Gender',
y='Exited',
text = 'Exited',
title='Churn Rate by Gender',
labels ={'Gender': "Gender", 'Exited': "Churn rate (%)"}
)
fig2.show()
# iii. Tenure
# churn rate (%) per tenure year
df_churnr_ten = bank_data3.groupby('Tenure').Exited.mean().round(2).reset_index()
df_churnr_ten['Exited'] = df_churnr_ten['Exited']*100
df_churnr_ten
| Tenure | Exited | |
|---|---|---|
| 0 | 0.0 | 23.0 |
| 1 | 1.0 | 22.0 |
| 2 | 2.0 | 19.0 |
| 3 | 3.0 | 21.0 |
| 4 | 4.0 | 20.0 |
| 5 | 5.0 | 21.0 |
| 6 | 6.0 | 20.0 |
| 7 | 7.0 | 17.0 |
| 8 | 8.0 | 19.0 |
| 9 | 9.0 | 22.0 |
| 10 | 10.0 | 21.0 |
# bar chart: churn rate by tenure
fig3 = px.bar(
df_churnr_ten,
x='Tenure',
y='Exited',
text = 'Exited',
title='Churn Rate by Tenure',
labels ={'Tenure': "Tenure", 'Exited': "Churn rate (%)"},
)
# one tick per tenure year
fig3.update_xaxes(tick0=0, dtick=1)
fig3.show()
The three churn rate visuals are combined with the country, age and credit score distributions.
#create empty subplots
# 2x3 grid: churn rates on top, distributions below
titles = ['Churn Rate by Country (%)',
'Churn Rate by Gender (%)',
'Churn Rate by Tenure (%)',
'Country Distribution (count)',
'Age Distribution (count)',
'Credit Score Distribution (count)']
fig8 = make_subplots(rows=2,cols=3, subplot_titles=titles)
#add traces
fig8.add_trace(
go.Bar(
x=df_churnr_geo['Geography'],
y=df_churnr_geo['Exited'],
name = 'Churn Rate by Geography (%)',
text=df_churnr_geo['Exited']
),
row=1, col=1
)
fig8.add_trace(
go.Bar(
x=df_churnr_gen['Gender'],
y=df_churnr_gen['Exited'],
text=df_churnr_gen['Exited']
),
row=1, col=2
)
fig8.add_trace(
go.Bar(
x=df_churnr_ten['Tenure'],
y=df_churnr_ten['Exited'],
text=df_churnr_ten['Exited']
),
row=1, col=3
)
df_geo = bank_data2.groupby('Geography').RowNumber.count().reset_index()
df_geo
fig8.add_trace(
go.Bar(
x=df_geo['Geography'],
y=df_geo['RowNumber'],
text=df_geo['RowNumber']
),
row=2, col=1
)
fig8.add_trace(
go.Histogram(
x=bank_data3['Age'],
nbinsx=20
),
row=2, col=2
)
fig8.add_trace(
go.Histogram(
x=bank_data3['CreditScore'],
nbinsx=20
),
row=2, col=3
)
#finetune layout and show figure
fig8.update_layout(height=600, width=1000,font_size=12, showlegend=False)
fig8['layout']['annotations'][0]['font']['size'] = 14
fig8['layout']['annotations'][1]['font']['size'] = 14
fig8['layout']['annotations'][2]['font']['size'] = 14
fig8['layout']['annotations'][3]['font']['size'] = 14
fig8['layout']['annotations'][4]['font']['size'] = 14
fig8['layout']['annotations'][5]['font']['size'] = 14
fig8.update_xaxes(tick0=0, dtick=1, row=1, col=3)
fig8.show()
Check the maximum number of missing values per row and how many rows fall into each missing-value count
# Count how many customers have each number of missing values per row.
grouped = bank_data3.loc[:, ['CustomerId', 'nan_sum']].groupby('nan_sum')
grouped.count()
| CustomerId | |
|---|---|
| nan_sum | |
| 0 | 8558 |
| 1 | 1340 |
| 2 | 97 |
| 3 | 5 |
The maximum number of missing values per row is three; all other rows have two or fewer missing values. Since no row has a high number of missing values, no entire rows will be dropped.
Replace missing 'Surname' and 'Geography' values with "N/A". The missing values in the 'Gender' column will not be replaced, since this column will be one-hot encoded anyway. The 'Geography' column will also be one-hot encoded, but here we replace its missing values temporarily so that the column can still be used for grouped statistics, such as the mean 'Age' by 'Geography'.
# Assign the filled column back instead of fillna(..., inplace=True) on a
# column slice: the chained in-place fill triggers SettingWithCopyWarning
# and relies on chained-assignment behavior deprecated in pandas 2.x.
bank_data3['Surname'] = bank_data3['Surname'].fillna("N/A")
bank_data3['Geography'] = bank_data3['Geography'].fillna("N/A")
Replace missing boolean values with "False"
# Missing flags are treated as False. Assign back rather than using
# fillna(..., inplace=True) on a column slice (SettingWithCopyWarning /
# deprecated chained assignment in pandas 2.x).
bank_data3['HasCrCard'] = bank_data3['HasCrCard'].fillna(False)
bank_data3['IsActiveMember'] = bank_data3['IsActiveMember'].fillna(False)
Replace missing 'EstimatedSalary' values with the mean
bank_data3.loc[bank_data3['EstimatedSalary'].isna(), 'EstimatedSalary'] = bank_data3['EstimatedSalary'].mean()
Replace missing 'Tenure' and 'NumOfProducts' values with median
# Both columns get the same treatment: fill missing entries with the median.
for median_col in ('Tenure', 'NumOfProducts'):
    bank_data3.loc[bank_data3[median_col].isna(), median_col] = bank_data3[median_col].median()
Replace missing 'CreditScore' values with the mean, depending on 'HasCrCard'
# Mean credit score per card-ownership group, used as the fill value.
creditscore_replacer = bank_data3.groupby('HasCrCard')['CreditScore'].mean()
# Vectorized, index-aligned lookup via Series.map replaces the manual
# per-row list comprehension (same values, simpler and faster).
missing_score = bank_data3['CreditScore'].isna()
bank_data3.loc[missing_score, 'CreditScore'] = (
    bank_data3.loc[missing_score, 'HasCrCard'].map(creditscore_replacer))
Replace missing 'Age' values with median by 'Geography'
# Rounded median age per country, used as the fill value.
age_replacer = bank_data3.groupby('Geography')['Age'].median().round()
# Series.map does the per-row lookup the original list comprehension did,
# but vectorized and index-aligned.
missing_age = bank_data3['Age'].isna()
bank_data3.loc[missing_age, 'Age'] = (
    bank_data3.loc[missing_age, 'Geography'].map(age_replacer))
Replace missing 'Balance' values with mean by 'Geography'
# Mean balance per country, used as the fill value.
balance_replacer = bank_data3.groupby('Geography')['Balance'].mean()
# Series.map does the per-row lookup the original list comprehension did,
# but vectorized and index-aligned.
missing_balance = bank_data3['Balance'].isna()
bank_data3.loc[missing_balance, 'Balance'] = (
    bank_data3.loc[missing_balance, 'Geography'].map(balance_replacer))
Remove columns that are not needed any longer: the outlier flag columns and 'nan_sum'
# Select the surviving columns in a single step and take an explicit copy.
# The original sliced the frame and then called drop(inplace=True) on the
# slice, which is exactly what raised the SettingWithCopyWarning below.
keep_cols = [c for c in bank_data3.columns if "outl" not in c and c != 'nan_sum']
bank_data3 = bank_data3[keep_cols].copy()
<ipython-input-64-91ca39dd62c2>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Copy to new dataframe without missing values
bank_data4 = bank_data3.copy()
One Hot Encoding for categorical variables ('Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited')
my_ohe = OneHotEncoder(sparse_output=False)

def _one_hot(frame, column, encoder, column_names=None):
    """One-hot encode *column* of *frame* into a dense int 0/1 DataFrame.

    column_names: explicit output column names; defaults to the category
    values discovered by the encoder.
    """
    encoded = encoder.fit_transform(frame[[column]])
    names = encoder.categories_[0] if column_names is None else column_names
    return pd.DataFrame(encoded, columns=names, index=frame.index).astype(int)

# The same encode-and-wrap pattern was repeated five times; one helper keeps
# the per-column intent visible.
geography_df = _one_hot(bank_data4, "Geography", my_ohe)
gender_df = _one_hot(bank_data4, "Gender", my_ohe)
# Drop the last gender column (it is redundant given the others).
gender_df = gender_df.drop(gender_df.columns[[-1]], axis=1)
hascrcard_df = _one_hot(bank_data4, "HasCrCard", my_ohe,
                        ['HasCrCard False', 'HasCrCard True'])
isactivemember_df = _one_hot(bank_data4, "IsActiveMember", my_ohe,
                             ['IsActiveMember False', 'IsActiveMember True'])
exited_df = _one_hot(bank_data4, "Exited", my_ohe,
                     ['Exited False', 'Exited True'])
Concat one hot encoder dataframes to original dataframe and delete columns that are not needed any longer
# Append the one-hot frames, then drop the raw categorical columns and the
# redundant encoded columns ('N/A' and the explicit False columns).
encoded_frames = [geography_df, gender_df, hascrcard_df, isactivemember_df, exited_df]
obsolete_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited',
                 'N/A', 'HasCrCard False', 'IsActiveMember False', 'Exited False']
bank_data4 = pd.concat([bank_data4, *encoded_frames], axis=1).drop(obsolete_cols, axis=1)
Save to new dataframe with encoded categorical variables
bank_data5 = bank_data4.copy()
Scale continuous variables ('CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary'). MinMaxScaler is used so that we will have similar values as for the categorical variables, which are either 0 or 1.
mmsc = MinMaxScaler()

def _minmax(column):
    # Rescale a single column to [0, 1] and return it as a flat 1-D array.
    return mmsc.fit_transform(bank_data5[[column]]).flatten()

mm_CreditScore = _minmax("CreditScore")
mm_Age = _minmax("Age")
mm_Tenure = _minmax("Tenure")
mm_Balance = _minmax("Balance")
mm_NumOfProducts = _minmax("NumOfProducts")
mm_EstimatedSalary = _minmax("EstimatedSalary")
The scaled variables are added to the dataframe
# Attach each scaled array under a '<name>Scaled' column.
scaled_columns = {
    'CreditScoreScaled': mm_CreditScore,
    'AgeScaled': mm_Age,
    'TenureScaled': mm_Tenure,
    'BalanceScaled': mm_Balance,
    'NumOfProductsScaled': mm_NumOfProducts,
    'EstimatedSalaryScaled': mm_EstimatedSalary,
}
for column_name, values in scaled_columns.items():
    bank_data5[column_name] = values
The dataframe is saved as csv file
# Persist the fully prepared dataframe for the model-building notebook;
# index=False keeps the pandas index out of the csv.
bank_data5.to_csv("/content/drive/MyDrive/DataPreparation/churn_data_after_processing.csv", index=False)
bank_data5.columns
Index(['Unnamed: 0', 'RowNumber', 'CustomerId', 'Surname', 'CreditScore',
'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary',
'France', 'Germany', 'Spain', 'Female', 'Male', 'HasCrCard True',
'IsActiveMember True', 'Exited True', 'CreditScoreScaled', 'AgeScaled',
'TenureScaled', 'BalanceScaled', 'NumOfProductsScaled',
'EstimatedSalaryScaled'],
dtype='object')
Create correlation matrix with categorical and continuous variables
# Pairwise correlations between the (unscaled) continuous features and the
# one-hot encoded categorical features, rounded to three decimals.
corr_features = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'HasCrCard True', 'IsActiveMember True', 'EstimatedSalary',
    'Exited True', 'France', 'Germany', 'Spain', 'Female', 'Male',
]
df_corr = bank_data5[corr_features].corr().round(3).reset_index()
df_corr.head()
| index | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard True | IsActiveMember True | EstimatedSalary | Exited True | France | Germany | Spain | Female | Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CreditScore | 1.000 | -0.005 | 0.000 | 0.005 | 0.006 | -0.009 | 0.026 | -0.000 | -0.028 | -0.010 | 0.006 | 0.002 | 0.003 | 0.000 |
| 1 | Age | -0.005 | 1.000 | -0.011 | 0.033 | -0.031 | -0.011 | 0.084 | -0.008 | 0.282 | -0.040 | 0.048 | -0.003 | 0.028 | -0.025 |
| 2 | Tenure | 0.000 | -0.011 | 1.000 | -0.011 | 0.018 | 0.021 | -0.029 | 0.009 | -0.014 | -0.003 | -0.001 | 0.004 | -0.014 | 0.015 |
| 3 | Balance | 0.005 | 0.033 | -0.011 | 1.000 | -0.290 | -0.016 | -0.010 | 0.012 | 0.119 | -0.224 | 0.396 | -0.135 | -0.012 | 0.014 |
| 4 | NumOfProducts | 0.006 | -0.031 | 0.018 | -0.290 | 1.000 | 0.002 | 0.011 | 0.012 | -0.049 | -0.002 | -0.012 | 0.007 | 0.015 | -0.020 |
Drop row with correlation between 'Exited True' and 'Exited True'
# Remove the 'Exited True' self-correlation row by feature name. The
# original filtered on the float value != 1, which matches the stated
# intent only by accident and would also drop any other feature that
# happened to correlate perfectly with churn.
df_corr = df_corr[df_corr['index'] != 'Exited True']
df_corr.head()
| index | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard True | IsActiveMember True | EstimatedSalary | Exited True | France | Germany | Spain | Female | Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CreditScore | 1.000 | -0.005 | 0.000 | 0.005 | 0.006 | -0.009 | 0.026 | -0.000 | -0.028 | -0.010 | 0.006 | 0.002 | 0.003 | 0.000 |
| 1 | Age | -0.005 | 1.000 | -0.011 | 0.033 | -0.031 | -0.011 | 0.084 | -0.008 | 0.282 | -0.040 | 0.048 | -0.003 | 0.028 | -0.025 |
| 2 | Tenure | 0.000 | -0.011 | 1.000 | -0.011 | 0.018 | 0.021 | -0.029 | 0.009 | -0.014 | -0.003 | -0.001 | 0.004 | -0.014 | 0.015 |
| 3 | Balance | 0.005 | 0.033 | -0.011 | 1.000 | -0.290 | -0.016 | -0.010 | 0.012 | 0.119 | -0.224 | 0.396 | -0.135 | -0.012 | 0.014 |
| 4 | NumOfProducts | 0.006 | -0.031 | 0.018 | -0.290 | 1.000 | 0.002 | 0.011 | 0.012 | -0.049 | -0.002 | -0.012 | 0.007 | 0.015 | -0.020 |
Plot correlation matrix
# Bar chart of every feature's correlation with 'Exited True'.
fig9 = go.Figure(
    go.Bar(x=df_corr['index'], y=df_corr['Exited True'])
)
fig9.update_layout(
    title="Feature Correlation Matrix for 'Exited'",
    xaxis_title="Feature",
    yaxis_title="Correlation",
)
fig9.show()
The bars are ordered so that the feature with the highest correlation is the first bar
# Reorder the x-axis categories so the most churn-correlated feature comes first.
ranking = df_corr.sort_values(by=['Exited True'], ascending=False)['index'].tolist()
fig9.update_xaxes(categoryorder='array', categoryarray=ranking)
fig9.show()
The correlation value is added to the top of the bar
# Print each bar's y-value (the correlation) on top of the bar.
fig9.update_traces(texttemplate="%{y}")
fig9.show()
A line showing the average correlation is added to the figure.
# Mean correlation with churn across all features, rounded to 3 decimals;
# stored as a constant column so it can be plotted as a horizontal line.
mean_corr = df_corr['Exited True'].mean()
average_corr = mean_corr.round(3)
df_corr['Average'] = average_corr
df_corr.head()
| index | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard True | IsActiveMember True | EstimatedSalary | Exited True | France | Germany | Spain | Female | Male | Average | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CreditScore | 1.000 | -0.005 | 0.000 | 0.005 | 0.006 | -0.009 | 0.026 | -0.000 | -0.028 | -0.010 | 0.006 | 0.002 | 0.003 | 0.000 | 0.014 |
| 1 | Age | -0.005 | 1.000 | -0.011 | 0.033 | -0.031 | -0.011 | 0.084 | -0.008 | 0.282 | -0.040 | 0.048 | -0.003 | 0.028 | -0.025 | 0.014 |
| 2 | Tenure | 0.000 | -0.011 | 1.000 | -0.011 | 0.018 | 0.021 | -0.029 | 0.009 | -0.014 | -0.003 | -0.001 | 0.004 | -0.014 | 0.015 | 0.014 |
| 3 | Balance | 0.005 | 0.033 | -0.011 | 1.000 | -0.290 | -0.016 | -0.010 | 0.012 | 0.119 | -0.224 | 0.396 | -0.135 | -0.012 | 0.014 | 0.014 |
| 4 | NumOfProducts | 0.006 | -0.031 | 0.018 | -0.290 | 1.000 | 0.002 | 0.011 | 0.012 | -0.049 | -0.002 | -0.012 | 0.007 | 0.015 | -0.020 | 0.014 |
# Overlay the constant average-correlation line and point at it with an
# arrow annotation.
fig9.add_trace(go.Scatter(
    x=df_corr['index'],
    y=df_corr['Average'],
    mode='lines',
))
fig9.add_annotation(
    x='NumOfProducts',
    y=0.014,
    text="Average correlation 0.014",
    showarrow=True,
    arrowhead=1,
)
# Hide the legend and render the final figure.
fig9.update_layout(showlegend=False)
fig9.show()